{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T02:59:49.720088Z",
     "start_time": "2019-03-03T02:59:49.652122Z"
    }
   },
   "outputs": [],
   "source": [
    "\"\"\"\n",
    "数据预处理，为 FFM 做准备\n",
    "1、生成FFM所需格式的数据\n",
    "2、训练FFM\n",
    "3、调参\n",
    "4、得到预测值\n",
    "    \n",
    "\"\"\"\n",
    "\n",
    "##================ 导入必要的模块 ==================##\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import gc\n",
    "import pickle\n",
    "\n",
    "import xlearn as xl\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:04:44.718777Z",
     "start_time": "2019-03-03T03:04:44.713316Z"
    }
   },
   "outputs": [],
   "source": [
    "##================ 文件路径 ==================##\n",
    "## raw data 2 (for read)\n",
    "fp_raw_train = \"feature_engineering/train_f.csv\"\n",
    "fp_raw_test  = \"feature_engineering/test_f.csv\"\n",
    "\n",
    "## label encoder for features\n",
    "fp_lb_enc = \"feature_engineering/lb_enc\"\n",
    "\n",
    "## input\n",
    "fp_train = \"fm/train_ffm.txt\"\n",
    "fp_valid = \"fm/valid_ffm.txt\"\n",
    "fp_test  = \"fm/test_ffm.txt\"\n",
    "\n",
    "## output\n",
    "fp_model_fm  = \"fm/model_fm.out\"\n",
    "fp_model_ffm = \"fm/model_ffm.out\"\n",
    "fp_pred_fm   = \"fm/output_fm.txt\"\n",
    "fp_pred_ffm  = \"fm/output_ffm.txt\"\n",
    "\n",
    "## submissions\n",
    "fp_sub_fm  = \"fm/Submission_FM.csv\"\n",
    "fp_sub_ffm = \"fm/Submission_FFM.csv\"\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:04:47.398377Z",
     "start_time": "2019-03-03T03:04:47.393291Z"
    }
   },
   "outputs": [],
   "source": [
    "##================ 准备数据 ==================##\n",
    "## feature names\n",
    "cols = ['C1',\n",
    "        'banner_pos', \n",
    "        'site_domain', \n",
    "        'site_id',\n",
    "        'site_category',\n",
    "        'app_id',\n",
    "        'app_category', \n",
    "        'device_type', \n",
    "        'device_conn_type',\n",
    "        'C14', \n",
    "        #'C15',\n",
    "        #'C16',\n",
    "        'date',\n",
    "        'time_period',\n",
    "        'weekday',\n",
    "        'C15_C16'  ]\n",
    "\n",
    "cols_train = ['id', 'click']\n",
    "cols_test  = ['id']\n",
    "cols_train.extend(cols)\n",
    "cols_test.extend(cols)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:05:01.192529Z",
     "start_time": "2019-03-03T03:04:53.446486Z"
    }
   },
   "outputs": [],
   "source": [
    "#----- 数据集label transform  -----#\n",
    "label_enc = pickle.load(open(fp_lb_enc, 'rb'))\n",
    "\n",
    "## train set\n",
    "df_train = pd.read_csv(fp_raw_train)\n",
    "for col in cols:\n",
    "    df_train[col] = label_enc[col].fit_transform(df_train[col].values)\n",
    "\n",
    "## test set\n",
    "df_test = pd.read_csv(fp_raw_test)\n",
    "for col in cols:\n",
    "    df_test[col] = label_enc[col].fit_transform(df_test[col].values)\n",
    "df_test['click'] = -1  # add a placeholder\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:05:05.319477Z",
     "start_time": "2019-03-03T03:05:04.802560Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/lxg/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py:6201: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version\n",
      "of pandas will change to not sort by default.\n",
      "\n",
      "To accept the future behavior, pass 'sort=True'.\n",
      "\n",
      "To retain the current behavior and silence the warning, pass sort=False\n",
      "\n",
      "  sort=sort)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "21"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##----- 合并 train-test数据集 -----##\n",
    "n_train = len(df_train)\n",
    "n_test = len(df_test)\n",
    "df = df_train.append(df_test)   \n",
    "del df_train, df_test\n",
    "gc.collect()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:14:00.186945Z",
     "start_time": "2019-03-03T03:05:09.768357Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##-----  train/valid/test数据格式转换为libffm需要的格式 -----##\n",
    "def convert_to_ffm(df, numerics, categories, features, Label, n_train, train_size=0.5):\n",
    "    \"\"\"\n",
    "    function: generation of train/valid/test set format as libffm \n",
    "    \n",
    "    parameters:\n",
    "        df: pandas dataframe include raw data of train and test.\n",
    "        numerics: name list of numerical features.\n",
    "        categories: name list of categorical features.\n",
    "        features: name list of all features.\n",
    "        Label: name of label in the df.\n",
    "        n_train: number of training samples.\n",
    "        train_size: the ratio of train_valid split.\n",
    "    \"\"\"\n",
    "    catdict = {}\n",
    "    # Flagging categorical and numerical fields\n",
    "    for x in numerics:\n",
    "        catdict[x] = 0\n",
    "    for x in categories:\n",
    "        catdict[x] = 1\n",
    "    \n",
    "    nrows = df.shape[0]\n",
    "    \n",
    "    # samples' number of train\n",
    "    n1 = n_train * train_size\n",
    "    \n",
    "    with open(fp_train, \"w\") as file_train, \\\n",
    "         open(fp_valid, \"w\") as file_valid, \\\n",
    "         open(fp_test, \"w\")  as file_test:\n",
    "        \n",
    "        # Looping over rows to convert each row to libffm format\n",
    "        for n, r in enumerate(range(nrows)):\n",
    "            \n",
    "            datastring = \"\"\n",
    "            datarow = df.iloc[r].to_dict()\n",
    "            datastring += str(int(datarow[Label]))\n",
    "            # For  fields, we are creating a dummy field here\n",
    "            for i, x in enumerate(catdict.keys()):\n",
    "                if(catdict[x]==0):  # numerical\n",
    "                    datastring = datastring + \" \"+str(i)+\":\"+ str(i)+\":\"+ str(datarow[x])\n",
    "                else:  # categorical\n",
    "                    datastring = datastring + \" \"+str(i)+\":\"+ str(int(datarow[x]))+\":1\"\n",
    "            datastring += '\\n'\n",
    "            \n",
    "            if n < n1: file_train.write(datastring)\n",
    "            elif n < n_train: file_valid.write(datastring)\n",
    "            else: file_test.write(datastring)\n",
    "\n",
    "convert_to_ffm(df, numerics=[], categories=cols, features=cols, Label='click', n_train=n_train, train_size=0.8)\n",
    "\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:16:16.476845Z",
     "start_time": "2019-03-03T03:15:09.986914Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##================ FM ==================##\n",
    "## setting\n",
    "fm_model = xl.create_fm()  # Use factorization machine\n",
    "fm_model.setTrain(fp_train)   # Training data\n",
    "fm_model.setValidate(fp_valid)  # Validation data\n",
    "fm_model.setSigmoid()\n",
    "\n",
    "param = {'task': 'binary',\n",
    "         'k': 20,\n",
    "         'lr': 0.02, \n",
    "         'lambda': 0.002,\n",
    "         'epoch': 100,\n",
    "         'opt': 'adagrad'\n",
    "         }\n",
    "\n",
    "## training\n",
    "fm_model.fit(param, fp_model_fm)\n",
    "\n",
    "## testing\n",
    "fm_model.setTest(fp_test)\n",
    "fm_model.setSigmoid()\n",
    "fm_model.predict(fp_model_fm, fp_pred_fm)\n",
    "\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:17:43.670717Z",
     "start_time": "2019-03-03T03:16:29.754715Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "##================ FFM ==================##\n",
    "## 训练\n",
    "ffm_model = xl.create_ffm()  # Use field-aware factorization machine\n",
    "ffm_model.setTrain(fp_train)   # Training data\n",
    "ffm_model.setValidate(fp_valid)  # Validation data\n",
    "ffm_model.setSigmoid()\n",
    "\n",
    "param = {'task': 'binary',\n",
    "         'k': 20,\n",
    "         'lr': 0.02, \n",
    "         'lambda': 0.0001,\n",
    "         'epoch': 100,\n",
    "         'opt': 'adagrad'\n",
    "         }\n",
    "\n",
    "## Train model\n",
    "ffm_model.fit(param, fp_model_ffm)\n",
    "\n",
    "## Test model\n",
    "ffm_model.setTest(fp_test)\n",
    "ffm_model.setSigmoid()\n",
    "ffm_model.predict(fp_model_ffm, fp_pred_ffm)\n",
    "\n",
    "gc.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:18:13.975995Z",
     "start_time": "2019-03-03T03:17:58.116749Z"
    }
   },
   "outputs": [],
   "source": [
    "##================ 生成提交数据 ==================##\n",
    "#----- fm -----#\n",
    "y_pred_fm = np.loadtxt(fp_pred_fm)\n",
    "df_test = pd.read_csv(fp_raw_test, dtype={'id':str})\n",
    "df_test['click'] = y_pred_fm\n",
    "\n",
    "with open(fp_sub_fm, 'w') as f: \n",
    "    df_test.to_csv(f, columns=['id', 'click'], header=True, index=False)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2019-03-03T03:19:42.748842Z",
     "start_time": "2019-03-03T03:19:30.238163Z"
    }
   },
   "outputs": [],
   "source": [
    "#----- ffm -----#\n",
    "y_pred_ffm = np.loadtxt(fp_pred_ffm)\n",
    "df_test = pd.read_csv(fp_raw_test, dtype={'id':str})\n",
    "df_test['click'] = y_pred_ffm\n",
    "\n",
    "with open(fp_sub_ffm, 'w') as f: \n",
    "    df_test.to_csv(f, columns=['id', 'click'], header=True, index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
